library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(dplyr)
# Load the reduced USIDNET data set prepared in step 04a.
USIDNET_reducida_04a <- readRDS(paste0("data/", "USIDNET_reducida_04a", ".rds"))
USIDNET_reducida_04a
# lista_data_acc<-list()
USIDNET_reducida_05 <- as.data.frame(USIDNET_reducida_04a)
# Encode the diagnosis category as a numeric class label (1..K) for the
# multinomial model fitted below.
USIDNET_reducida_05$dx <- as.numeric(as.factor(USIDNET_reducida_05$Category))
library(caret)
# Fraction of observations that goes into the training set.
prop_extraer_base <- .75
# Seed derived from the split proportion so the partition is reproducible.
set.seed(prop_extraer_base * 100)
inTrain <- createDataPartition(y = USIDNET_reducida_05$dx,
                               ## the outcome data are needed
                               p = prop_extraer_base,
                               ## The percentage of data in the
                               ## training set
                               list = FALSE)
pob00_train0 <- USIDNET_reducida_05[inTrain, ]
pob00_test0 <- USIDNET_reducida_05[-inTrain, ]
# `as_data_frame()` is deprecated; `as_tibble()` is the current replacement.
pob00_train0 %>% as_tibble(); pob00_test0 %>% as_tibble()
#names(pob00_train0)
# Gradient-descent helpers: devianza_calc(), grad_calc(), descenso() and
# pred_multinom() are defined in this sourced utility file.
source("utils/descenso_gradiente_bbva.R")
# Training design matrix: every column except the identifier and the outcome.
x_ent <- pob00_train0 %>%
  select(-id_px, -Category, -dx) %>%
  as.matrix
y_ent <- pob00_train0$dx
# Standardise the predictors; keep the centring/scaling constants so the
# test matrix can later be transformed with the *training* statistics.
x_ent_s <- scale(x_ent)
medias <- attr(x_ent_s, 'scaled:center')
# NOTE(review): this numeric vector shadows stats::sd by name. Function
# calls such as sd(x) still resolve to stats::sd (R skips non-function
# objects when looking up a function), but a less ambiguous name would be
# safer; the name is kept because it is used as `scale = sd` further down.
sd <- attr(x_ent_s, 'scaled:scale')
# `as_data_frame()` is deprecated; `as_tibble()` is the current replacement.
x_ent %>% as_tibble(); x_ent_s %>% as_tibble()
# Test design matrix, built with the same columns as the training matrix.
x_pr <- pob00_test0 %>%
  select(-id_px, -Category, -dx) %>%
  as.matrix
y_pr <- pob00_test0$dx
# `as_data_frame()` is deprecated; `as_tibble()` is the current replacement.
x_pr %>% as_tibble()
# Number of predictors and number of diagnostic classes.
p<-ncol(x_ent)
K<-length(unique(USIDNET_reducida_05$dx))
# dev_ent <- devianza_calc(x = x_ent_s,y = y_ent)
# grad <- grad_calc(x_ent = x_ent_s, y_ent)
# Deviance and gradient closures over the standardised training data
# (both come from utils/descenso_gradiente_bbva.R).
dev_ent <- devianza_calc(x = x_ent_s,y = y_ent)
grad <- grad_calc(x_ent = x_ent_s, y_ent)
# Run 5001 gradient-descent iterations from an all-zero start with step
# size 1e-4; each row of the result holds the (p+1)*(K-1) coefficients
# after one iteration (presumably — confirm against descenso()'s source).
iteraciones05 <- descenso(5001,rep(0, (p+1)*(K-1)), eta=0.0001,
h_deriv = grad, dev_fun = dev_ent)
[1] "iteration:100 - betas from 101"
[1] "iteration:200 - betas from 201"
[1] "iteration:300 - betas from 301"
[1] "iteration:400 - betas from 401"
[1] "iteration:500 - betas from 501"
[1] "iteration:600 - betas from 601"
[1] "iteration:700 - betas from 701"
[1] "iteration:800 - betas from 801"
[1] "iteration:900 - betas from 901"
[1] "iteration:1000 - betas from 1001"
[1] "iteration:1100 - betas from 1101"
[1] "iteration:1200 - betas from 1201"
[1] "iteration:1300 - betas from 1301"
[1] "iteration:1400 - betas from 1401"
[1] "iteration:1500 - betas from 1501"
[1] "iteration:1600 - betas from 1601"
[1] "iteration:1700 - betas from 1701"
[1] "iteration:1800 - betas from 1801"
[1] "iteration:1900 - betas from 1901"
[1] "iteration:2000 - betas from 2001"
[1] "iteration:2100 - betas from 2101"
[1] "iteration:2200 - betas from 2201"
[1] "iteration:2300 - betas from 2301"
[1] "iteration:2400 - betas from 2401"
[1] "iteration:2500 - betas from 2501"
[1] "iteration:2600 - betas from 2601"
[1] "iteration:2700 - betas from 2701"
[1] "iteration:2800 - betas from 2801"
[1] "iteration:2900 - betas from 2901"
[1] "iteration:3000 - betas from 3001"
[1] "iteration:3100 - betas from 3101"
[1] "iteration:3200 - betas from 3201"
[1] "iteration:3300 - betas from 3301"
[1] "iteration:3400 - betas from 3401"
[1] "iteration:3500 - betas from 3501"
[1] "iteration:3600 - betas from 3601"
[1] "iteration:3700 - betas from 3701"
[1] "iteration:3800 - betas from 3801"
[1] "iteration:3900 - betas from 3901"
[1] "iteration:4000 - betas from 4001"
[1] "iteration:4100 - betas from 4101"
[1] "iteration:4200 - betas from 4201"
[1] "iteration:4300 - betas from 4301"
[1] "iteration:4400 - betas from 4401"
[1] "iteration:4500 - betas from 4501"
[1] "iteration:4600 - betas from 4601"
[1] "iteration:4700 - betas from 4701"
[1] "iteration:4800 - betas from 4801"
[1] "iteration:4900 - betas from 4901"
[1] "iteration:5000 - betas from 5001"
iteraciones <- iteraciones05
# Training deviance of every iterate. `vapply()` with `seq_len()` is safer
# than `sapply(1:nrow(...))`: fixed return type and no 1:0 edge case.
devianzas_iteraciones <- vapply(
  seq_len(nrow(iteraciones)),
  function(i) dev_ent(iteraciones[i, ]),
  numeric(1)
)
# NOTE(review): this frame is rebuilt (with an English column name) right
# after the save below; kept for fidelity with the original flow.
df_devianzas_iteraciones <- data.frame(
  id = seq_len(nrow(iteraciones)),
  devianzas = devianzas_iteraciones
)
saveRDS(iteraciones, "data/iteraciones_5001_0s_0.0001_copy.rds")
# iteraciones<-readRDS("data/iteraciones_5001_0s_0.0001_copy.rds")
# Recompute dimensions and closures; kept so this section also works when
# `iteraciones` is restored from disk via the commented readRDS line above.
p <- ncol(x_ent)
K <- length(unique(USIDNET_reducida_05$dx))
dev_ent <- devianza_calc(x = x_ent_s, y = y_ent)
grad <- grad_calc(x_ent = x_ent_s, y_ent)
# `vapply()` with `seq_len()` replaces the `sapply(1:nrow(...))` anti-pattern.
devianzas_iteraciones <- vapply(
  seq_len(nrow(iteraciones)),
  function(i) dev_ent(iteraciones[i, ]),
  numeric(1)
)
df_devianzas_iteraciones <- data.frame(
  id = seq_len(nrow(iteraciones)),
  deviances = devianzas_iteraciones
)
df_devianzas_iteraciones
lista_data_acc <- list()
# The six iterates with the lowest training deviance (head() default n = 6,
# despite the `top5` name).
top5 <- head(df_devianzas_iteraciones %>% arrange(deviances))
# `data_frame()` is deprecated; `tibble()` is the current replacement.
data_acc <- tibble()
# Standardise the test matrix ONCE with the training centring/scaling
# constants (hoisted out of the loop: it does not depend on the iterate).
x_pr_s <- scale(x_pr, center = medias, scale = sd)
# For each low-deviance iterate, report train/test accuracy and collect
# the metrics into `data_acc`.
for (id_top in seq_len(nrow(top5))) {
  id_mindev <- top5[id_top, 1]
  print(paste0("-------->>>> id: ", id_top, "<<<<--------"))
  print(id_mindev)
  print("deviance:")
  print(dev_ent(iteraciones[id_mindev, ]))
  # Predicted class = column of highest predicted probability per row.
  probas <- pred_multinom(x_ent_s, iteraciones[id_mindev, ])
  clase <- apply(probas, 1, which.max)
  print("train:")
  #print(table(clase, y_ent ))
  acc_train <- 1 - mean(clase != y_ent)
  print(acc_train)
  probas <- pred_multinom(x_pr_s, iteraciones[id_mindev, ])
  clase <- apply(probas, 1, which.max)
  print("test:")
  #print(table(clase, y_pr ))
  acc_test <- 1 - mean(clase != y_pr)
  print(acc_test)
  # `data_frame()` is deprecated; `tibble()` is the current replacement.
  data_acc <- data_acc %>%
    bind_rows(
      tibble(
        id = id_mindev,
        dev_train = dev_ent(iteraciones[id_mindev, ]),
        acc_train = acc_train,
        acc_test = acc_test
      )
    )
}
[1] "-------->>>> id: 1<<<<--------"
[1] 4840
[1] "deviance:"
[1] 4298.162
[1] "train:"
[1] 0.6562848
[1] "test:"
[1] 0.5401338
[1] "-------->>>> id: 2<<<<--------"
[1] 4839
[1] "deviance:"
[1] 4378.981
[1] "train:"
[1] 0.6779755
[1] "test:"
[1] 0.5618729
[1] "-------->>>> id: 3<<<<--------"
[1] 4273
[1] "deviance:"
[1] 4428.902
[1] "train:"
[1] 0.6451613
[1] "test:"
[1] 0.5183946
[1] "-------->>>> id: 4<<<<--------"
[1] 4272
[1] "deviance:"
[1] 4488.603
[1] "train:"
[1] 0.6457175
[1] "test:"
[1] 0.5284281
[1] "-------->>>> id: 5<<<<--------"
[1] 4614
[1] "deviance:"
[1] 4503.951
[1] "train:"
[1] 0.6490545
[1] "test:"
[1] 0.5401338
[1] "-------->>>> id: 6<<<<--------"
[1] 4810
[1] "deviance:"
[1] 4521.62
[1] "train:"
[1] 0.6529477
[1] "test:"
[1] 0.5334448
# Pick the iterate to report: lowest training deviance wins; ties are
# broken by highest test accuracy, then highest training accuracy, then
# lowest iteration id (equivalent to the chained min/max filters).
idmin <- data_acc %>%
  arrange(dev_train, desc(acc_test), desc(acc_train), id) %>%
  slice(1) %>%
  pull(id)
# ---- Report metrics for the selected iterate (idmin) ----
print("--------------------------------------------")
[1] "--------------------------------------------"
print("--------------- BEST RESULT --------------")
[1] "--------------- BEST RESULT --------------"
print("-------------- A C C U R A C Y --------------")
[1] "-------------- A C C U R A C Y --------------"
# Training predictions: column of highest predicted probability per row
# (pred_multinom comes from the sourced utility file).
probas <- pred_multinom(x_ent_s, iteraciones[idmin,])
clase <- apply(probas, 1, which.max)
print("train:")
[1] "train:"
# Confusion matrix and accuracy on the training set.
table_train<-table(clase, y_ent)
accuracy_train<-1 - mean(clase != y_ent)
print(accuracy_train)
[1] 0.6562848
print("
...
")
[1] "\n...\n "
# Test set: standardise with the TRAINING means/scales before predicting.
x_pr_s <- scale(x_pr, center = medias, scale = sd)
probas <- pred_multinom(x_pr_s, iteraciones[idmin,])
clase <- apply(probas, 1, which.max)
print("test:")
[1] "test:"
table_test<-table(clase, y_pr )
accuracy_test<-1 - mean(clase != y_pr)
print(accuracy_test)
[1] 0.5401338
# Mean cell count of the 12x12 test confusion matrix.
mean(table_test)
[1] 4.152778
# Bundle every result of this run under a single named entry so several
# runs can coexist inside `lista_data_acc`.
resultados_run <- list(
  data_acc = data_acc,
  idmin = idmin,
  table_train = table_train,
  table_test = table_test,
  accuracy_train = accuracy_train,
  accuracy_test = accuracy_test
)
lista_data_acc[["usidnet4a"]] <- resultados_run
print("
...
")
[1] "\n...\n "
print("--------------------------------------------")
[1] "--------------------------------------------"
print("-------------- CONFUSION MATRIX -------------")
[1] "-------------- CONFUSION MATRIX -------------"
print("train:")
[1] "train:"
# Training confusion matrix (rows = predicted class `clase`,
# columns = true class `y_ent`).
lista_data_acc$usidnet4a$table_train;
y_ent
clase 1 2 3 4 5 6 7 8 9 10 11 12
1 23 4 0 0 2 26 5 1 0 0 2 1
2 3 48 1 0 2 26 5 1 0 0 5 1
3 4 1 95 0 5 19 2 0 0 0 4 2
4 0 0 0 5 0 2 0 0 0 0 1 1
5 2 2 9 0 68 25 5 0 0 1 10 1
6 48 23 7 4 17 581 31 14 0 3 31 8
7 4 13 5 0 5 97 251 4 0 0 29 13
8 2 2 1 0 0 10 1 12 0 0 1 0
9 0 0 0 0 0 0 0 0 3 0 0 0
10 1 1 0 0 2 0 0 0 0 15 3 0
11 0 1 0 1 0 11 2 1 0 0 58 2
12 1 3 1 0 4 19 3 0 0 0 8 21
print("test:")
[1] "test:"
# Test confusion matrix (rows = predicted class, columns = true `y_pr`).
lista_data_acc$usidnet4a$table_test;
y_pr
clase 1 2 3 4 5 6 7 8 9 10 11 12
1 4 2 1 0 0 8 1 0 1 0 5 0
2 5 12 1 0 0 16 3 0 0 0 1 0
3 0 3 19 0 6 4 0 0 1 0 5 0
4 0 0 0 0 0 0 0 1 0 0 0 0
5 1 0 0 0 11 11 1 1 0 1 1 0
6 14 13 7 2 3 183 12 4 1 3 17 6
7 1 4 5 0 2 38 81 3 0 0 6 6
8 1 0 0 0 0 8 0 1 0 1 3 0
9 0 0 0 0 0 2 0 0 0 0 0 0
10 0 1 2 0 3 1 0 1 0 1 0 0
11 3 1 0 0 0 5 2 2 0 0 10 0
12 0 1 0 0 2 6 1 0 0 0 2 1
# Persist the accumulated results, then re-derive the chosen iterate
# (lowest training deviance, ties broken by lowest id) from the stored copy.
saveRDS(lista_data_acc, "data/lista_data_acc_usidnet4a_copy.rds")
data_acc <- lista_data_acc$usidnet4a$data_acc
idmin <- data_acc %>%
  arrange(dev_train, id) %>%
  slice(1) %>%
  pull(id)
betas <- iteraciones[idmin, ]
# betas
#p;K;
#(p+1)*(K-1);
#length(betas)
# Reshape the flat coefficient vector into (K-1) rows of (p+1) columns
# (intercept plus p predictors); the K-th class is the reference and is
# appended as a row with intercept 1 and all slopes 0.
# `as_data_frame()` is deprecated; base as.data.frame() yields the same
# V1..Vn column names, which are replaced right after this pipeline anyway.
df_betas <- as.data.frame(matrix(betas, K - 1, p + 1, byrow = TRUE)) %>%
  bind_rows(
    as.data.frame(matrix(c(1, rep(0, p)), nrow = 1))
  ) %>%
  mutate(
    dx = as.character(row_number())
  ) %>%
  left_join(
    # Case counts and overall proportions per diagnosis class.
    USIDNET_reducida_05 %>%
      group_by(dx, Category) %>%
      summarise(
        n = n()
      ) %>%
      ungroup() %>%
      mutate(
        dx = as.character(dx),
        prop = round(n / sum(n), 3)
      )
  ) %>%
  select(dx, Category, n, prop, one_of(names(.)))
Joining, by = "dx"
# Human-readable column names: class metadata followed by beta_0
# (intercept) and beta_1..beta_p (one coefficient per predictor).
names(df_betas)<-c("dx","Category","n","prop",paste0("beta_",(seq(p+1)-1)))
df_betas
# prod_matrices<-as.matrix(cbind(1, x)) %*% t(beta_mat)
# Metadata columns of df_betas that are not model coefficients.
excluir <- c("dx", "Category", "n", "prop")
# Training data with an explicit intercept column, ordered as
# dx, intercept, predictors — the layout the per-class weight loop expects.
ptsig00 <- as.data.frame(
  pob00_train0 %>%
    # select( -id_px, -Category, -dx)%>%
    select(-id_px, -Category) %>%
    mutate(intercept = 1) %>%
    select(dx, intercept, one_of(names(.)))
)
lista_resultados <- list()
# `data_frame()` is deprecated; `tibble()` is the current replacement.
tbl_pesos <- tibble()
# For every class, compute a per-feature "weight": each predictor column
# is multiplied by its class coefficient, and the standard deviation of
# that product across the training rows becomes an importance score,
# normalised by the total of those s.d.s (`pesos` below).
for(clase_i in 1:K){
# clase_i<-1
print(clase_i)
# Category label, case count and proportion for this class.
dx_tmp<-pob00_train0%>%filter(dx==clase_i)%>%distinct(Category)%>%pull(Category)
n_tmp<-df_betas%>%filter(dx==clase_i)%>%distinct(n)%>%pull(n)
prop_tmp<-df_betas%>%filter(dx==clase_i)%>%distinct(prop)%>%pull(prop)
print(dx_tmp)
# Row of df_betas holding this class's coefficients plus metadata columns.
siestan<-as.data.frame(df_betas)[clase_i,]
# ptsig00<-pob_test[c("malos","denomsy",names(siestan)[!names(siestan)%in%excluir])]
# ptsig<-as.data.frame(apply(ptsig00,2,as.numeric))
# Numeric copy of the training data (dx, intercept, predictors).
# NOTE(review): apply() on a data.frame coerces to a matrix first — it
# works here but assumes every column is numeric-coercible.
ptsig<-as.data.frame(apply(ptsig00,2,as.numeric))
varTi<-0
# data_vars<-as.data.frame(ptsig[1,names(siestan)[!names(siestan)%in%excluir][-1]])
# One column per predictor; row 1 will hold the beta id, row 2 the s.d.
data_vars<-as.data.frame(ptsig[1,names(ptsig00)[-c(1:2)]])
names(data_vars)<-names(ptsig00)[-c(1:2)]
data_vars[1,]<-0
data_vars[2,]<-0
# data_vars[3,]<-0
# Loop over the predictor columns (skipping dx and intercept).
for(j in 1:(length(names(ptsig))-2)){
# j<-1
#print(names(ptsig)[j+2])
#print(names(siestan[!names(siestan)%in%excluir][1+j]))
# Predictor column times its class coefficient. After dropping the
# metadata columns, index [1] is beta_0 (intercept), so [1+j] is the
# j-th predictor's beta.
betawoe<-ptsig[,2+j]*as.numeric(siestan[!names(siestan)%in%excluir][1+j])
ptsig<-cbind(ptsig,betawoe)
# s.d. of the weighted column. `ptsig$betawoe` still refers to the column
# just cbind-ed, because it is renamed below only after use. `sd` here
# resolves to stats::sd even though a numeric vector named `sd` exists
# at top level (R skips non-functions when resolving a function call).
varj<-round(sd(ptsig$betawoe),4)
# woe_var<-names(siestan)[!names(siestan)%in%excluir][j+1]
# var_original<-gsub("woe_","",woe_var)
# mtr3_tmp<-MTR3_yk(ptsig[,3+j],ptsig$malos,ptsig$denomsy)
# gini<-as.data.frame(
# df_ginis%>%
# filter(var==woe_var)
# )$Gini #unique(lmtr5$lista$woes_nesp$Gini[lmtr5$lista$woes_nesp$var==var_tmp])
# mtr4_tmp<-MTR4_yk(ptsig,"woe_IM_MEDIO_PAGO_TDC_6M",vobj = "malos",denomsy = ptsig$denomsy)
data_vars[1,names(ptsig)[j+2]]<-names(siestan[!names(siestan)%in%excluir][1+j])
data_vars[2,names(ptsig)[j+2]]<-varj
# data_vars[3,names(ptsig)[j+2]]<-names(ptsig)[j+2]
# data_vars[3,names(siestan)[!names(siestan)%in%excluir][1+j]]<-gini #max(mtr3_tmp$ga,mtr3_tmp$gd)
# Running total of the s.d.s, used below as the normaliser.
varTi<-varTi + varj
names(ptsig)[ncol(ptsig)]<-paste("beta",gsub(" ","_",names(ptsig)[2+j]),sep="_")
}
data_vars$varTi<-varTi
# data_vars$varTi[3]<-0
# Relative weight of each feature: its s.d. over the total s.d.
pesos<-round(as.numeric(data_vars[2,])/data_vars$varTi[2],4)
data_vars<-rbind(data_vars,pesos)
row.names(data_vars)<-c("id_beta","desviacion_s","pesos")#"nombre_original","pesos")
#print(data_vars)
# Transpose: one row per feature, columns id_beta/desviacion_s/pesos.
data_vars<-as.data.frame(t(data_vars))
#print(data_vars)
#print(length(t(siestan[!names(siestan)%in%excluir][-1])))
#blabla<-t(siestan[!names(siestan)%in%excluir][-1])
#print(blabla)
# Raw coefficients; -9999 is a filler for the extra varTi row.
data_vars$beta_value <- c(t(siestan[!names(siestan)%in%excluir][-1]),-9999)
data_vars$clase_i<-clase_i
data_vars$Diagnostico<-dx_tmp
data_vars$num_casos_dx<-n_tmp
data_vars$proporcion_casos_dx<-prop_tmp
data_vars$variable<-row.names(data_vars)
# data_vars<-data_vars[,c(6,1:5)]
data_vars<-data_vars%>%
select(Diagnostico,clase_i,num_casos_dx,proporcion_casos_dx,variable,id_beta,beta_value,desviacion_s,pesos)
# NOTE(review): growing tbl_pesos with rbind inside a loop is O(n^2) in
# general; acceptable here with only K = 12 iterations.
tbl_pesos<-rbind(tbl_pesos,data_vars)
}
[1] 1
[1] "AB Deficiency"
[1] 2
[1] "AGAMMA"
[1] 3
[1] "CGD"
[1] 4
[1] "COMPDEF"
[1] 5
[1] "CORE"
[1] 6
[1] "CVID"
[1] 7
[1] "DGS"
[1] 8
[1] "HIGM"
[1] 9
[1] "LAD"
[1] 10
[1] "NEMO"
[1] 11
[1] "SCID"
[1] 12
[1] "WAS"
# Translate the Spanish feature names to English for reporting.
# The generic gsub('Dolor', ...) must come AFTER the more specific
# Dolor_* rules, otherwise it would clobber them.
tbl_pesos <- tbl_pesos %>%
  mutate(
    variable = gsub('Bajo', 'low', variable),
    variable = gsub('Alto', 'high', variable),
    variable = gsub('LINFOCITOS', 'Lymphocytes', variable),
    variable = gsub('LEUCOCITOS', 'Leukocytes', variable),
    variable = gsub('MONOCITOS', 'Monocytes', variable),
    variable = gsub('NEUTROFILOS', 'Neutrophils', variable),
    # Fixed misspelling: was 'Plateletes'.
    variable = gsub('PLAQUETAS', 'Platelets', variable),
    variable = gsub('EOSINOFILOS', 'Eosinophils', variable),
    variable = gsub('Dolor_abdominal', 'I_Abdominal_pain', variable),
    variable = gsub('Reflujo_gastroesofagico', 'I_Gastroesophageal_reflux', variable),
    variable = gsub('Dolor_toracico', 'I_Thoracic_pain', variable),
    variable = gsub('Dolor', 'I_Pain', variable)
    # A duplicate gsub('Alto','high') at the end was a no-op and was removed.
  )
# Full weight table with English column names (display only).
tbl_pesos %>%
  select(Diagnostico, num_casos_dx, proporcion_casos_dx, variable, beta_value, pesos) %>%
  rename(Dx = Diagnostico) %>%
  rename(number_cases_per_dx = num_casos_dx) %>%
  rename(percentage_cases_per_dx = proporcion_casos_dx) %>%
  rename(feature = variable) %>%
  rename(weight_importance = pesos)
# Top-5 features by weight for every diagnosis, ordered by class prevalence.
# BUG FIX(review): the original referenced `tbl_pesos_01`, which is never
# defined in this file; the pipeline now starts from `tbl_pesos`, whose
# columns match every name used below.
tbl_pesos %>%
  group_by(Diagnostico) %>%
  arrange(Diagnostico, desc(pesos)) %>%
  mutate(
    nivel_pesos = row_number()
  ) %>%
  filter(nivel_pesos <= 5) %>%
  ungroup() %>%
  arrange(desc(proporcion_casos_dx), Diagnostico, desc(pesos)) %>%
  select(Diagnostico, variable, beta_value, pesos) %>%
  rename(Dx = Diagnostico) %>%
  rename(feature = variable) %>%
  rename(weight_importance = pesos)
Given the modest performance obtained with gradient descent (about 54% accuracy in the test sample), it was decided to try another machine learning technique, such as XGBoost.